{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.4/dist-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.\n",
      "  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"Adult\")\n",
    "adult_filename = os.path.join(data_folder, \"adult.data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "adult = pd.read_csv(adult_filename, header=None, names=[\"Age\", \"Work-Class\", \"fnlwgt\", \"Education\",\n",
    "                                                        \"Education-Num\", \"Marital-Status\", \"Occupation\",\n",
    "                                                        \"Relationship\", \"Race\", \"Sex\", \"Capital-gain\",\n",
    "                                                        \"Capital-loss\", \"Hours-per-week\", \"Native-Country\",\n",
    "                                                        \"Earnings-Raw\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "adult.dropna(how='all', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Age', 'Work-Class', 'fnlwgt', 'Education', 'Education-Num', 'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-Country', 'Earnings-Raw'], dtype='object')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adult.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    32561.000000\n",
       "mean        40.437456\n",
       "std         12.347429\n",
       "min          1.000000\n",
       "25%         40.000000\n",
       "50%         40.000000\n",
       "75%         45.000000\n",
       "max         99.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adult[\"Hours-per-week\"].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10.0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adult[\"Education-Num\"].median()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',\n",
       "       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'], dtype=object)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adult[\"Work-Class\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "X = np.arange(30).reshape((10, 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [ 3,  4,  5],\n",
       "       [ 6,  7,  8],\n",
       "       [ 9, 10, 11],\n",
       "       [12, 13, 14],\n",
       "       [15, 16, 17],\n",
       "       [18, 19, 20],\n",
       "       [21, 22, 23],\n",
       "       [24, 25, 26],\n",
       "       [27, 28, 29]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X[:,1] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [ 3,  1,  5],\n",
       "       [ 6,  1,  8],\n",
       "       [ 9,  1, 11],\n",
       "       [12,  1, 14],\n",
       "       [15,  1, 17],\n",
       "       [18,  1, 20],\n",
       "       [21,  1, 23],\n",
       "       [24,  1, 26],\n",
       "       [27,  1, 29]])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_selection import VarianceThreshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "vt = VarianceThreshold()\n",
    "Xt = vt.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  2],\n",
       "       [ 3,  5],\n",
       "       [ 6,  8],\n",
       "       [ 9, 11],\n",
       "       [12, 14],\n",
       "       [15, 17],\n",
       "       [18, 20],\n",
       "       [21, 23],\n",
       "       [24, 26],\n",
       "       [27, 29]])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Xt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 74.25   0.    74.25]\n"
     ]
    }
   ],
   "source": [
    "print(vt.variances_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X = adult[[\"Age\", \"Education-Num\", \"Capital-gain\", \"Capital-loss\", \"Hours-per-week\"]].values\n",
    "y = (adult[\"Earnings-Raw\"] == ' >50K').values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import chi2\n",
    "transformer = SelectKBest(score_func=chi2, k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  8.60061182e+03   2.40142178e+03   8.21924671e+07   1.37214589e+06\n",
      "   6.47640900e+03]\n"
     ]
    }
   ],
   "source": [
    "Xt_chi2 = transformer.fit_transform(X, y)\n",
    "print(transformer.scores_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from scipy.stats import pearsonr\n",
    "\n",
    "def multivariate_pearsonr(X, y):\n",
    "    scores, pvalues = [], []\n",
    "    for column in range(X.shape[1]):\n",
    "        cur_score, cur_p = pearsonr(X[:,column], y)\n",
    "        scores.append(abs(cur_score))\n",
    "        pvalues.append(cur_p)\n",
    "    return (np.array(scores), np.array(pvalues))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.2340371   0.33515395  0.22332882  0.15052631  0.22968907]\n"
     ]
    }
   ],
   "source": [
    "transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)\n",
    "Xt_pearson = transformer.fit_transform(X, y)\n",
    "print(transformer.scores_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.cross_validation import cross_val_score\n",
    "clf = DecisionTreeClassifier(random_state=14)\n",
    "scores_chi2 = cross_val_score(clf, Xt_chi2, y, scoring='accuracy')\n",
    "scores_pearson = cross_val_score(clf, Xt_pearson, y, scoring='accuracy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Chi2 performance: 0.829\n",
      "Pearson performance: 0.771\n"
     ]
    }
   ],
   "source": [
    "print(\"Chi2 performance: {0:.3f}\".format(scores_chi2.mean()))\n",
    "print(\"Pearson performance: {0:.3f}\".format(scores_pearson.mean()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.base import TransformerMixin\n",
    "from sklearn.utils import as_float_array\n",
    "\n",
    "class MeanDiscrete(TransformerMixin):\n",
    "    def fit(self, X, y=None):\n",
    "        X = as_float_array(X)\n",
    "        self.mean = np.mean(X, axis=0)\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        X = as_float_array(X)\n",
    "        assert X.shape[1] == self.mean.shape[0]\n",
    "        return X > self.mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mean_discrete = MeanDiscrete()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_mean = mean_discrete.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting adult_tests.py\n"
     ]
    }
   ],
   "source": [
    "%%file adult_tests.py\n",
    "import numpy as np\n",
    "from numpy.testing import assert_array_equal\n",
    "\n",
    "def test_meandiscrete():\n",
    "    X_test = np.array([[ 0,  2],\n",
    "                        [ 3,  5],\n",
    "                        [ 6,  8],\n",
    "                        [ 9, 11],\n",
    "                        [12, 14],\n",
    "                        [15, 17],\n",
    "                        [18, 20],\n",
    "                        [21, 23],\n",
    "                        [24, 26],\n",
    "                        [27, 29]])\n",
    "    mean_discrete = MeanDiscrete()\n",
    "    mean_discrete.fit(X_test)\n",
    "    assert_array_equal(mean_discrete.mean, np.array([13.5, 15.5]))\n",
    "    X_transformed = mean_discrete.transform(X_test)\n",
    "    X_expected = np.array([[ 0,  0],\n",
    "                            [ 0, 0],\n",
    "                            [ 0, 0],\n",
    "                            [ 0, 0],\n",
    "                            [ 0, 0],\n",
    "                            [ 1, 1],\n",
    "                            [ 1, 1],\n",
    "                            [ 1, 1],\n",
    "                            [ 1, 1],\n",
    "                            [ 1, 1]])\n",
    "    assert_array_equal(X_transformed, X_expected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'test_meandiscrete' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-29-cbf134d89aa0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtest_meandiscrete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m: name 'test_meandiscrete' is not defined"
     ]
    }
   ],
   "source": [
    "test_meandiscrete()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "pipeline = Pipeline([('mean_discrete', MeanDiscrete()),\n",
    "                     ('classifier', DecisionTreeClassifier(random_state=14))])\n",
    "scores_mean_discrete = cross_val_score(pipeline, X, y, scoring='accuracy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Mean Discrete performance: {0:.3f}\".format(scores_mean_discrete.mean()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
